# Packages
library(tidyverse)
library(lubridate)
library(plotly)
library(sqldf)
library(corrplot)
library(rpart)
library(caret)
library(rpart.plot)
library(party)
library(randomForest)
library(e1071)
library(neuralnet)
# Loading dataset

bikes <- read.csv("SeoulBikeData.csv", header = TRUE, sep = ",")

str(bikes)
## 'data.frame':    8760 obs. of  14 variables:
##  $ Date                     : Factor w/ 365 levels "01/01/2018","01/02/2018",..: 12 12 12 12 12 12 12 12 12 12 ...
##  $ Rented.Bike.Count        : int  254 204 173 107 78 100 181 460 930 490 ...
##  $ Hour                     : int  0 1 2 3 4 5 6 7 8 9 ...
##  $ Temperature..C.          : num  -5.2 -5.5 -6 -6.2 -6 -6.4 -6.6 -7.4 -7.6 -6.5 ...
##  $ Humidity...              : int  37 38 39 40 36 37 35 38 37 27 ...
##  $ Wind.speed..m.s.         : num  2.2 0.8 1 0.9 2.3 1.5 1.3 0.9 1.1 0.5 ...
##  $ Visibility..10m.         : int  2000 2000 2000 2000 2000 2000 2000 2000 2000 1928 ...
##  $ Dew.point.temperature..C.: num  -17.6 -17.6 -17.7 -17.6 -18.6 -18.7 -19.5 -19.3 -19.8 -22.4 ...
##  $ Solar.Radiation..MJ.m2.  : num  0 0 0 0 0 0 0 0 0.01 0.23 ...
##  $ Rainfall.mm.             : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Snowfall..cm.            : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Seasons                  : Factor w/ 4 levels "Autumn","Spring",..: 4 4 4 4 4 4 4 4 4 4 ...
##  $ Holiday                  : Factor w/ 2 levels "Holiday","No Holiday": 2 2 2 2 2 2 2 2 2 2 ...
##  $ Functioning.Day          : Factor w/ 2 levels "No","Yes": 2 2 2 2 2 2 2 2 2 2 ...
summary(bikes)
##          Date      Rented.Bike.Count      Hour       Temperature..C. 
##  01/01/2018:  24   Min.   :   0.0    Min.   : 0.00   Min.   :-17.80  
##  01/02/2018:  24   1st Qu.: 191.0    1st Qu.: 5.75   1st Qu.:  3.50  
##  01/03/2018:  24   Median : 504.5    Median :11.50   Median : 13.70  
##  01/04/2018:  24   Mean   : 704.6    Mean   :11.50   Mean   : 12.88  
##  01/05/2018:  24   3rd Qu.:1065.2    3rd Qu.:17.25   3rd Qu.: 22.50  
##  01/06/2018:  24   Max.   :3556.0    Max.   :23.00   Max.   : 39.40  
##  (Other)   :8616                                                     
##   Humidity...    Wind.speed..m.s. Visibility..10m. Dew.point.temperature..C.
##  Min.   : 0.00   Min.   :0.000    Min.   :  27     Min.   :-30.600          
##  1st Qu.:42.00   1st Qu.:0.900    1st Qu.: 940     1st Qu.: -4.700          
##  Median :57.00   Median :1.500    Median :1698     Median :  5.100          
##  Mean   :58.23   Mean   :1.725    Mean   :1437     Mean   :  4.074          
##  3rd Qu.:74.00   3rd Qu.:2.300    3rd Qu.:2000     3rd Qu.: 14.800          
##  Max.   :98.00   Max.   :7.400    Max.   :2000     Max.   : 27.200          
##                                                                             
##  Solar.Radiation..MJ.m2.  Rainfall.mm.     Snowfall..cm.       Seasons    
##  Min.   :0.0000          Min.   : 0.0000   Min.   :0.00000   Autumn:2184  
##  1st Qu.:0.0000          1st Qu.: 0.0000   1st Qu.:0.00000   Spring:2208  
##  Median :0.0100          Median : 0.0000   Median :0.00000   Summer:2208  
##  Mean   :0.5691          Mean   : 0.1487   Mean   :0.07507   Winter:2160  
##  3rd Qu.:0.9300          3rd Qu.: 0.0000   3rd Qu.:0.00000                
##  Max.   :3.5200          Max.   :35.0000   Max.   :8.80000                
##                                                                           
##        Holiday     Functioning.Day
##  Holiday   : 432   No : 295       
##  No Holiday:8328   Yes:8465       
##                                   
##                                   
##                                   
##                                   
## 
#### Descriptive analysis ####

# Unique variables of humidity

c(unique(bikes["Humidity..."]))
## $Humidity...
##  [1] 37 38 39 40 36 35 27 24 21 23 25 26 54 58 66 77 79 81 83 84 87 86 82 68 57
## [26] 49 41 48 51 53 52 55 56 69 71 73 75 91 92 89 85 76 90 88 47 30 29 32 43 45
## [51] 44 42 34 33 31 28 46 59 78 70 64 60 94 93 96 65 50 74 63 61 72 62 22 67 80
## [76] 95 15 20 17 18 16 19 14 97 98 10 13 12 11  0
# Unique variables of hour

c(unique(bikes["Hour"]))
## $Hour
##  [1]  0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
# Transforming factor date column into date

bikes$Date <- as.POSIXct(bikes$Date, format = "%d/%m/%Y")

bikes$Date <- as.Date(bikes$Date)
# Bikes rented over time plot

a <- ggplot(data = bikes) +
  geom_line(aes(x = Date, y = Rented.Bike.Count), 
            color = "#09557f",
            alpha = 0.6,
            size = 0.6) +
  theme_minimal() +
  labs(x = "Date", 
       y = "Bikes" ,
       title = "Base Plot")

a <- ggplotly(a)
a
#### Exploratory analysis ####

# Bikes rented per season plot

b <- ggplot(data = bikes) +
  geom_line(aes(x = Date, y = Rented.Bike.Count, col = Seasons), 
            alpha = 0.6,
            size = 0.6) +
  scale_color_manual(labels = c("Autumn", "Spring", "Summer", "Winter"),
                     values = c("red", "green", "yellow", "blue")) +
  theme_minimal() +
  labs(x = "Date", 
       y = "Bikes" ,
       title = "Bikes rented per seasons ")

b <- ggplotly(b)
b
# Bikes rented overtime in holidays and normal days plot

c <- ggplot(data = bikes) +
  geom_line(aes(x = Date, y = Rented.Bike.Count, col = Holiday), 
            alpha = 0.6,
            size = 0.6) +
  theme_minimal() +
  labs(x = "Date", 
       y = "Bikes" ,
       title = "Bikes rented holiday and normal days")

c <- ggplotly(c)
c
# Rainfall and solar radiation in different seasons in the bike shops plot

d <- ggplot(data = bikes) +
  geom_line(aes(x = Date, y = Rainfall.mm., col = Seasons), 
            alpha = 0.6,
            size = 0.6) +
  geom_line(aes(x = Date, y = Solar.Radiation..MJ.m2.), 
            alpha = 0.6,
            size = 0.6, color = "orange") +
  scale_color_manual(labels = c("Autumn", "Spring", "Summer", "Winter"),
                     values = c("red", "green", "yellow", "blue")) +
  theme_minimal() +
  labs(x = "Date", 
       y = "Rainfall mm and solar radiation" ,
       title = "Rainfall and solar radiation in different seasons in the bike shops")

d <- ggplotly(d)
d
e <- ggplot(data = bikes) +
  geom_line(aes(x = Date, y = Temperature..C., col = Seasons), 
            alpha = 0.6,
            size = 0.6) +
  geom_line(aes(x = Date, y = Wind.speed..m.s.), 
            alpha = 0.6,
            size = 0.6, color = "grey") +
  scale_color_manual(labels = c("Autumn", "Spring", "Summer", "Winter"),
                     values = c("red", "green", "yellow", "blue")) +
  theme_minimal() +
  labs(x = "Date", 
       y = "Temperature in C and wind speed" ,
       title = "Temperature and wind speed in different seasons in the bike shops")


e <- ggplotly(e)
e
#### Pre processing ####

# Attribute enginnering the seasons column

# transforming all seasons in categorical variables

all_seasons_col = 1 # Winter

all_seasons_col = 2 # Spring

all_seasons_col = 3 # Summer

all_seasons_col = 4 # Autumn
# sorting winter only column

winter_s <- sqldf(
  
  "SELECT * FROM bikes WHERE Seasons LIKE '%Winter' "
)

winter_s <- cbind(winter_s, all_seasons_col)

# sorting spring only column

Spring_s <- sqldf(
  
  "SELECT * FROM bikes WHERE Seasons LIKE '%Spring' "
)


Spring_s <- cbind(Spring_s, all_seasons_col)

# sorting autumn only column

Autumn_S <- sqldf(
  
  "SELECT * FROM bikes WHERE Seasons LIKE '%Autumn' "
)



Autumn_S <- cbind(Autumn_S, all_seasons_col)

# sorting summer only column

Summer_s <- sqldf(
  
  "SELECT * FROM bikes WHERE Seasons LIKE '%Summer' "
)

Summer_s <- cbind(Summer_s, all_seasons_col)
# merging all seasons in the same dataframe

all_seasons <- merge(winter_s, Spring_s, all = TRUE)

all_seasons2 <- merge(Autumn_S, Summer_s, all = TRUE)

all_seasons_final <- merge(all_seasons, all_seasons2, all = TRUE)

# variables correlations plot for feature selection

all_seasons_final$Date <- NULL
all_seasons_final$Seasons<- NULL
all_seasons_final$Holiday <- NULL
all_seasons_final$Functioning.Day <- NULL

correlations <- cor(all_seasons_final,method="pearson")
## Warning in cor(all_seasons_final, method = "pearson"): o desvio padrão é zero
corrplot(correlations, number.cex = .9, method = "circle", type = "full", tl.cex=0.8,tl.col = "black",
         title = "bikes")

# removing not important columns for machine learning

all_seasons_final$Functioning.Day <- NULL
all_seasons_final$Hour <- NULL
all_seasons_final$Dew.point.temperature..C. <- NULL
# converting some columns variables types

all_seasons_final$Visibility..10m. <- as.numeric(all_seasons_final$Visibility..10m.)

all_seasons_final$Humidity... <- as.numeric(all_seasons_final$Humidity...)

all_seasons_final$Rented.Bike.Count <- as.numeric(all_seasons_final$Rented.Bike.Count)
# separating data into train and test

indexes <- sample(1:nrow(all_seasons_final), size = 0.7 * nrow(all_seasons_final))
train.data.bikes <- all_seasons_final[indexes,]
test.data.bikes <- all_seasons_final[-indexes,]
class(train.data.bikes)
## [1] "data.frame"
class(test.data.bikes)
## [1] "data.frame"
str(train.data.bikes)
## 'data.frame':    6132 obs. of  9 variables:
##  $ Rented.Bike.Count      : num  1076 561 490 79 322 ...
##  $ Temperature..C.        : num  30.7 22.2 23.7 -3.6 28.3 -9 1.3 2.9 3 27.3 ...
##  $ Humidity...            : num  55 84 76 78 84 54 67 45 26 49 ...
##  $ Wind.speed..m.s.       : num  1.5 1.1 1.9 1.6 0.5 0.6 1.2 1.6 2 0.9 ...
##  $ Visibility..10m.       : num  2000 482 1281 471 1006 ...
##  $ Solar.Radiation..MJ.m2.: num  0 0 0.08 0 0 0 0 0.04 1.01 1.17 ...
##  $ Rainfall.mm.           : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Snowfall..cm.          : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ all_seasons_col        : num  4 4 4 4 4 4 4 4 4 4 ...
#### Machine learning ####

# Creating linear regression machine learning model 


modelo <- lm(Rented.Bike.Count ~. , data = train.data.bikes)


# prevision for the testing data

prevision1 <- predict(modelo, test.data.bikes)
## Warning in predict.lm(modelo, test.data.bikes): prediction from a rank-deficient
## fit may be misleading
summary(modelo)
## 
## Call:
## lm(formula = Rented.Bike.Count ~ ., data = train.data.bikes)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1382.44  -301.81   -50.63   221.97  2358.07 
## 
## Coefficients: (1 not defined because of singularities)
##                           Estimate Std. Error t value Pr(>|t|)    
## (Intercept)              874.66896   43.88692  19.930   <2e-16 ***
## Temperature..C.           35.77723    0.66219  54.029   <2e-16 ***
## Humidity...              -11.04399    0.48269 -22.880   <2e-16 ***
## Wind.speed..m.s.          59.16871    6.80406   8.696   <2e-16 ***
## Visibility..10m.          -0.01271    0.01299  -0.978   0.3281    
## Solar.Radiation..MJ.m2. -115.22038    9.97684 -11.549   <2e-16 ***
## Rainfall.mm.             -57.34208    6.29146  -9.114   <2e-16 ***
## Snowfall..cm.             39.25025   15.86454   2.474   0.0134 *  
## all_seasons_col                 NA         NA      NA       NA    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 496.8 on 6124 degrees of freedom
## Multiple R-squared:  0.4078, Adjusted R-squared:  0.4072 
## F-statistic: 602.5 on 7 and 6124 DF,  p-value: < 2.2e-16
# making plot of real and predictive data

x = 1:length(test.data.bikes$Rented.Bike.Count)

plot(x, test.data.bikes$Rented.Bike.Count, col = "red", type = "l", lwd=2,
     main = "Bikes selling real and predictive data (linear regression)")
lines(x, prevision1, col = "blue", lwd=2)
legend("topleft",  legend = c("real data", "predictive data"), 
       fill = c("red", "blue"), col = 2:3,  adj = c(0, 0.6))
grid()

# decision tree model

ma_model_tree <- rpart(Rented.Bike.Count ~., 
                     data = train.data.bikes)

rpart.plot(ma_model_tree)

pred_model <- predict(ma_model_tree, test.data.bikes)

printcp(ma_model_tree)
## 
## Regression tree:
## rpart(formula = Rented.Bike.Count ~ ., data = train.data.bikes)
## 
## Variables actually used in tree construction:
## [1] Humidity...             Solar.Radiation..MJ.m2. Temperature..C.        
## 
## Root node error: 2552796844/6132 = 416307
## 
## n= 6132 
## 
##         CP nsplit rel error  xerror     xstd
## 1 0.240967      0   1.00000 1.00020 0.021192
## 2 0.099732      1   0.75903 0.76114 0.016797
## 3 0.028707      2   0.65930 0.66623 0.015415
## 4 0.027141      3   0.63059 0.64896 0.014841
## 5 0.019555      4   0.60345 0.61083 0.014225
## 6 0.018036      5   0.58390 0.59219 0.013955
## 7 0.015525      6   0.56586 0.59105 0.013974
## 8 0.010000      7   0.55034 0.56484 0.013858
# making plot of real and predictive data

x = 1:length(test.data.bikes$Rented.Bike.Count)

plot(x, test.data.bikes$Rented.Bike.Count, col = "red", type = "l", lwd=2,
     main = "Bikes selling real and predictive data (decision tree)")
lines(x, pred_model, col = "blue", lwd=2)
legend("topleft",  legend = c("real data", "predictive data"), 
       fill = c("red", "blue"), col = 2:3,  adj = c(0, 0.6))
grid()

# Randomforest model

ma_model <- randomForest(Rented.Bike.Count ~., data = train.data.bikes)

print(ma_model)
## 
## Call:
##  randomForest(formula = Rented.Bike.Count ~ ., data = train.data.bikes) 
##                Type of random forest: regression
##                      Number of trees: 500
## No. of variables tried at each split: 2
## 
##           Mean of squared residuals: 182022.6
##                     % Var explained: 56.28
plot(ma_model)

summary(ma_model)
##                 Length Class  Mode     
## call               3   -none- call     
## type               1   -none- character
## predicted       6132   -none- numeric  
## mse              500   -none- numeric  
## rsq              500   -none- numeric  
## oob.times       6132   -none- numeric  
## importance         8   -none- numeric  
## importanceSD       0   -none- NULL     
## localImportance    0   -none- NULL     
## proximity          0   -none- NULL     
## ntree              1   -none- numeric  
## mtry               1   -none- numeric  
## forest            11   -none- list     
## coefs              0   -none- NULL     
## y               6132   -none- numeric  
## test               0   -none- NULL     
## inbag              0   -none- NULL     
## terms              3   terms  call
pred_model <- predict(ma_model, test.data.bikes)
summary(ma_model)
##                 Length Class  Mode     
## call               3   -none- call     
## type               1   -none- character
## predicted       6132   -none- numeric  
## mse              500   -none- numeric  
## rsq              500   -none- numeric  
## oob.times       6132   -none- numeric  
## importance         8   -none- numeric  
## importanceSD       0   -none- NULL     
## localImportance    0   -none- NULL     
## proximity          0   -none- NULL     
## ntree              1   -none- numeric  
## mtry               1   -none- numeric  
## forest            11   -none- list     
## coefs              0   -none- NULL     
## y               6132   -none- numeric  
## test               0   -none- NULL     
## inbag              0   -none- NULL     
## terms              3   terms  call
# making plot of real and predictive data

x = 1:length(test.data.bikes$Rented.Bike.Count)

plot(x, test.data.bikes$Rented.Bike.Count, col = "red", type = "l", lwd=2,
     main = "Bikes selling real and predictive data (random forest)")
lines(x, pred_model, col = "blue", lwd=2)
legend("topleft",  legend = c("real data", "predictive data"), 
       fill = c("red", "blue"), col = 2:3,  adj = c(0, 0.6))
grid()

# knn control archive

ctrl <- trainControl(method = "repeatedcv", repeats = 3) 

# knn model
knn_v1 <- train(Rented.Bike.Count ~ ., 
                data = train.data.bikes, 
                method = "knn", 
                trControl = ctrl, 
                tuneLength = 20)
knn_v1
## k-Nearest Neighbors 
## 
## 6132 samples
##    8 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 3 times) 
## Summary of sample sizes: 5520, 5518, 5519, 5518, 5518, 5518, ... 
## Resampling results across tuning parameters:
## 
##   k   RMSE      Rsquared   MAE     
##    5  503.1865  0.4032724  355.5974
##    7  496.9177  0.4119199  353.5838
##    9  491.1785  0.4223761  352.1396
##   11  490.4463  0.4233921  353.5994
##   13  489.4242  0.4254915  354.3774
##   15  490.7260  0.4226356  356.4222
##   17  492.5776  0.4185130  358.6990
##   19  494.4054  0.4145140  361.1090
##   21  496.1044  0.4108650  363.5777
##   23  497.4366  0.4080626  365.3939
##   25  498.7921  0.4051974  367.0448
##   27  499.9006  0.4029197  368.3748
##   29  501.6528  0.3990586  370.4802
##   31  503.2077  0.3956401  372.1997
##   33  504.7390  0.3922293  374.0067
##   35  506.3056  0.3885574  375.6480
##   37  507.6085  0.3855883  377.3169
##   39  508.7033  0.3831809  378.9777
##   41  510.0202  0.3800894  380.4712
##   43  511.1313  0.3775884  381.8409
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was k = 13.
pred_model <- predict(knn_v1, test.data.bikes)
# making plot of real and predictive data

x = 1:length(test.data.bikes$Rented.Bike.Count)

plot(x, test.data.bikes$Rented.Bike.Count, col = "red", type = "l", lwd=2,
     main = "Bikes selling real and predictive data (knn)")
lines(x, pred_model, col = "blue", lwd=2)
legend("topleft",  legend = c("real data", "predictive data"), 
       fill = c("red", "blue"), col = 2:3,  adj = c(0, 0.6))
grid()